Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd), SFARI genes with ensembl IDs (created in data_preprocessing.Rmd) and Neuronal annotations from Gene Ontology
Gene count by SFARI score:
table(DE_info$`gene-score`)
##
## 1 2 3 4 5 6 None
## 24 60 184 430 164 23 29222
GO Neuronal annotations:
print(glue(sum(GO_neuronal$ID %in% rownames(datExpr)), ' genes have neuronal-related annotations.'))
## 1134 genes have neuronal-related annotations.
print(glue(sum(SFARI_genes$ID %in% GO_neuronal$ID),' of these genes have a SFARI score.'))
## 206 of these genes have a SFARI score.
table(DE_info$Neuronal[DE_info$`gene-score` %in% c('1','2','3','4','5','6')],
DE_info$gene.score[DE_info$`gene-score` %in% c('1','2','3','4','5','6')])
##
## 1 2 3 4 5 6
## 0 15 44 144 357 128 19
## 1 9 16 40 73 36 4
The higher the SFARI score, the higher the mean expression of the gene but the lower the standard deviation. This is weird because the data was originally heteroscedastic with a positive relation between mean and variance, so the fact that the relation now seems to have reversed could mean that the vst normalisation ended up affecting the highly expressed genes more than it should have when trying to correct their higher variance.
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
Just to corroborate that the relation between sd and SFARI score used to be in the opposite direction before the normalisation: The higher the SFARI score the higher the mean expression and the higher the standard deviation
# Save preprocessed results
datExpr_prep = datExpr
datMeta_prep = datMeta
DE_info_prep = DE_info
load('./../Data/Gandal/filtered_raw_data.RData')
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
I think this reverse in the mean-SD relation happens when the variance grows at a slower rate than the mean, which was the case for this dataset:
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
fit = plot_data %>% lm(formula=SD~Mean)
m = round(fit$coefficients[[2]],2)
b = round(fit$coefficients[[1]],2)
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='#808080') +
scale_x_log10() + scale_y_log10() + coord_fixed() + theme_minimal() +
ggtitle(paste0('Linear fit with m=', m, ' and b=', b))
rm(plot_data, fit, m, b)
Return to normalised version of the data
# Save preprocessed results
datExpr = datExpr_prep
datMeta = datMeta_prep
DE_info = DE_info_prep
It seems there’s a negative relation between SFARI score and log fold change when it would be expected to be either positively correlated or independent from each other (because there are other factors that determine if a gene is releated to Autism apart from differences in gene expression)
Wikipedia mentions the likely explanation for this: “A disadvantage and serious risk of using fold change in this setting is that it is biased and may misclassify differentially expressed genes with large differences (B − A) but small ratios (B/A), leading to poor identification of changes at high expression levels”.
Based on this, since we saw there is a strong relation between SFARI score and mean expression, assuming the SFARI scores are reliable, log fold change should not be used as a metric to determine differential expression when studying autism.
On top of this, I believe this effect is made more extreme by the pattern found in the previous plots, since the higher expressed genes were the most affected by the normalisation transformation, ending up with a smaller variance than the rest of the data, which is related to smaller ratios. (This is a constant problem independently of the normalisation function used).
ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal() + theme(legend.position='none'))
The higher the SFARI score, the higher the percentage of genes that get filtered by differential expression.
This pattern is consistent for all log fold change thresholds
All SFARI scores except 6 are more affected by the filtering than the genes with Neuronal-related functional annotations
SFARI gene group 1 is more affected than the average gene, although the number of genes with SFARI score 1 is small, so this could be unreliable
At our recent threshold of log2(1.2), we lose all genes belonging to score 1 and 2
lfc_list = seq(1, 1.2, 0.01)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_info)))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_info$Neuronal)))
lfc_counts_all = DE_info %>% group_by(`gene-score`) %>% tally %>%
mutate('group'=`gene-score`, 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate DE_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=log2(lfc), altHypothesis='greaterAbs') %>% data.frame
DE_genes = DE_genes %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`))
DE_genes = DE_genes %>% filter(padj<0.05 & abs(log2FoldChange)>log2(lfc))
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Neuronal)))
lfc_counts = DE_genes %>% group_by(`gene-score`) %>% tally %>%
mutate('group'=`gene-score`, 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = DE_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='None') %>%
mutate('group'=`gene-score`, 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(DE_info$Neuronal)),
data.frame('group'='All', 'tot'=nrow(DE_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1, group!='None') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
scale_color_manual(values=SFARI_colour_hue(r=1:8)) + ylab('% of remaining genes') + xlab('Fold Change') +
ggtitle('Effect of filtering thresholds by SFARI score') + theme_minimal())
#rm(lfc_list,lfc, all_counts, Neuronal_counts, lfc_counts_all, DE_genes, lfc_counts)